ephrin_binding¶
Analyze receptor selection data using soluble Ephrin-B2 or -B3 NOTE NEED TO FIX
- Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
altair_config = None
nipah_config = None
# input files
entropy_file = None
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None
# output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
filtered_E2_binding_low_effect = None
filtered_E3_binding_low_effect = None
# output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
binding_region_boxplot_plot = None
binding_region_bubble_plot = None
max_binding_in_stalk = None
max_binding_in_contact = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect = (
"results/filtered_data/E2_binding_low_effect_filter.csv"
)
filtered_E3_binding_low_effect = (
"results/filtered_data/E3_binding_low_effect_filter.csv"
)
entry_binding_combined_corr_plot = (
"results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
"results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
binding_region_boxplot_plot = "results/images/binding_region_boxplot_plot.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
max_binding_in_contact = "results/images/max_binding_in_contact.html"
max_binding_in_stalk = "results/images/max_binding_in_stalk.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if (
os.getcwd()
== "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
pass
print("Already in correct directory")
else:
os.chdir(
"/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
)
print("Setup in correct directory")
Setup in correct directory
hard paths for running in interactive mode¶
In [5]:
if nipah_config is None:
##hard paths in case don't want to run with snakemake
print("loading hard paths")
altair_config = "data/custom_analyses_data/theme.py"
nipah_config = "nipah_config.yaml"
entropy_file = "results/entropy/entropy.csv"
# input files
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = (
"results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
)
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect = (
"results/filtered_data/E2_binding_low_effect_filter.csv"
)
filtered_E3_binding_low_effect = (
"results/filtered_data/E3_binding_low_effect_filter.csv"
)
Run config files to setup altair theme and config variables¶
In [6]:
if altair_config:
with open(altair_config, "r") as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Import the binding and entry data for EFNB2 and EFNB3¶
In [7]:
# import binding and entry data
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)
Filter the data and save¶
In [8]:
def merge_func_binding_dfs(func, binding, name):
# Merge the 'binding' and 'func' DataFrames on specified columns with outer join.
# This combines data based on 'site', 'mutant', and 'wildtype' columns.
# Suffixes are added to columns with identical names in both DataFrames for differentiation.
# The result is rounded to three decimal places for neatness.
df_int = pd.merge(
binding,
func,
on=["site", "mutant", "wildtype"],
suffixes=["_binding", "_cell_entry"],
validate="one_to_one",
how="outer",
).round(3)
# Rename specific columns for clarity.
df = df_int.rename(
columns={
"Ephrin binding_mean": "binding_mean",
"Ephrin binding_std": "binding_std",
"Ephrin binding_median": "binding_median",
}
)
# Select and reorder a subset of columns for the final DataFrame.
df = df[
[
"site",
"wildtype",
"mutant",
"binding_mean",
"binding_std",
"times_seen_binding",
"effect",
"effect_std",
"times_seen_cell_entry",
"frac_models",
]
]
# Define a function to filter the DataFrame based on several criteria.
def filter_binding_data(df):
# Apply multiple conditions to filter the data for relevant entries.
df_filter = df[
(df["mutant"] != "*")
& (df["mutant"] != "-")
& (df["site"] != 603)
&
# Conditions related to cell entry effects and observations.
(df["effect"] >= config["min_func_effect_for_binding"])
& (df["times_seen_cell_entry"] >= config["func_times_seen_cutoff"])
& (df["effect_std"] <= config["func_std_cutoff"])
&
# Conditions related to binding observations and variability.
(df["times_seen_binding"] >= config["min_times_seen_binding"])
& (df["binding_std"] <= config["max_binding_std"])
& (df["frac_models"] >= config["frac_models"])
]
return df_filter
# Filter the DataFrame using the defined function.
df_filter = filter_binding_data(df)
# Define a function to identify low effect mutants for further analysis.
def store_filtered_info(df):
df_low_filter = df[
(df["mutant"] != "*")
& (df["mutant"] != "-")
& (df["site"] != 603)
& (df["effect"] < config["min_func_effect_for_binding"])
& (df["times_seen_cell_entry"] >= config["func_times_seen_cutoff"])
& (df["effect_std"] <= config["func_std_cutoff"])
]
return df_low_filter
# Filter the DataFrame to identify low effect mutants.
df_low_effect_filter = store_filtered_info(df)
# Save the filtered data to CSV files, differentiating between EFNB2 and others.
if name == "EFNB2":
print(name) # Print the name for confirmation.
df_filter.to_csv(filtered_E2_binding_data, index=False)
df_low_effect_filter.to_csv(filtered_E2_binding_low_effect, index=False)
else:
df_filter.to_csv(filtered_E3_binding_data, index=False)
df_low_effect_filter.to_csv(filtered_E3_binding_low_effect, index=False)
# Return the filtered DataFrames for further use.
return df_filter, df_low_effect_filter
# Use the defined function to filter data for EFNB2 and EFNB3.
df_E2_filter, df_E2_filter_missing = merge_func_binding_dfs(e2_func, e2, "EFNB2")
df_E3_filter, df_E3_filter_missing = merge_func_binding_dfs(e3_func, e3, "EFNB3")
# Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
df_E2_filter,
df_E3_filter,
on=["site", "wildtype", "mutant"],
suffixes=["_E2", "_E3"],
how="outer",
)
# Display descriptive statistics of the merged DataFrame.
display(df_binding_effect_merge.describe().round(3))
# Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "EFNB2"
df_E3_filter["selection"] = "EFNB3"
# Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
EFNB2
| site | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7319.000 | 6805.000 | 6805.000 | 6805.000 | 6805.000 | 6805.000 | 6805.000 | 6805.000 | 6619.000 | 6619.000 | 6619.000 | 6619.000 | 6619.000 | 6619.000 | 6619.0 |
| mean | 342.744 | -0.315 | 0.507 | 6.350 | -0.110 | 0.372 | 7.631 | 0.994 | -0.018 | 0.180 | 6.158 | -0.091 | 0.390 | 6.733 | 1.0 |
| std | 148.521 | 1.105 | 0.340 | 3.208 | 0.480 | 0.195 | 4.381 | 0.050 | 0.272 | 0.173 | 3.075 | 0.447 | 0.188 | 3.591 | 0.0 |
| min | 71.000 | -5.280 | 0.005 | 2.000 | -1.498 | 0.010 | 2.000 | 0.500 | -2.147 | 0.000 | 2.000 | -1.499 | 0.033 | 2.000 | 1.0 |
| 25% | 215.000 | -0.448 | 0.273 | 4.500 | -0.366 | 0.224 | 5.000 | 1.000 | -0.150 | 0.063 | 4.000 | -0.308 | 0.249 | 4.571 | 1.0 |
| 50% | 341.000 | -0.040 | 0.427 | 5.750 | 0.012 | 0.339 | 6.750 | 1.000 | -0.012 | 0.135 | 5.500 | 0.029 | 0.357 | 6.000 | 1.0 |
| 75% | 466.000 | 0.255 | 0.651 | 7.500 | 0.257 | 0.485 | 9.000 | 1.000 | 0.118 | 0.242 | 7.500 | 0.236 | 0.500 | 7.857 | 1.0 |
| max | 602.000 | 2.160 | 2.911 | 43.750 | 0.604 | 0.994 | 72.380 | 1.000 | 2.006 | 2.957 | 45.000 | 0.615 | 1.000 | 56.710 | 1.0 |
In [9]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
print(f"We are analyzing {name}\n")
print(f"The total number of mutants was: {df.shape[0]}\n")
tmp_df = df.sort_values(by="binding_mean")
print("These are the lowest binding mutants detected:")
display(tmp_df.head(5))
tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
print("\nThese are the highest binding mutants detected:\n")
display(tmp_df_high.head(5))
# What about mutants with positive entry scores?
tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
print("These are the lowest binding mutants detected with positive entry scores:")
display(tmp_df.head(5))
tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
print(
"\nThese are the highest binding mutants detected with positive entry scores:\n"
)
display(tmp_df_high.head(5))
blah = df.groupby('site')['binding_mean'].sum().reset_index()
display(blah.sort_values(by='binding_mean',ascending=False))
find_highest_lowest(df_E2_filter, "EFNB2")
find_highest_lowest(df_E3_filter, "EFNB3")
We are analyzing EFNB2 The total number of mutants was: 6805 These are the lowest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8285 | 501 | E | K | -5.280 | 0.928 | 24.50 | -0.644 | 0.606 | 32.00 | 1.0 | EFNB2 |
| 8898 | 532 | A | V | -5.259 | 1.140 | 16.25 | -0.320 | 0.312 | 17.88 | 1.0 | EFNB2 |
| 3188 | 236 | R | H | -5.054 | 0.883 | 22.00 | -1.253 | 0.335 | 29.38 | 1.0 | EFNB2 |
| 8072 | 490 | Q | K | -4.966 | 1.081 | 8.50 | -0.692 | 0.539 | 11.38 | 1.0 | EFNB2 |
| 8070 | 490 | Q | H | -4.752 | 0.862 | 8.25 | -0.450 | 0.668 | 10.88 | 1.0 | EFNB2 |
These are the highest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9544 | 566 | F | C | 2.160 | 2.119 | 5.00 | -0.533 | 0.455 | 5.500 | 0.75 | EFNB2 |
| 9830 | 580 | I | S | 2.152 | 1.768 | 4.75 | -1.001 | 0.692 | 6.375 | 1.00 | EFNB2 |
| 5007 | 331 | N | E | 2.144 | 1.233 | 3.00 | -0.887 | 0.844 | 3.500 | 1.00 | EFNB2 |
| 3820 | 269 | N | E | 2.075 | 2.241 | 5.00 | -1.048 | 0.484 | 7.875 | 1.00 | EFNB2 |
| 9948 | 586 | N | T | 1.962 | 0.578 | 5.25 | -0.816 | 0.823 | 7.000 | 1.00 | EFNB2 |
These are the lowest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9402 | 558 | A | V | -4.647 | 0.730 | 17.25 | 0.118 | 0.364 | 18.50 | 1.0 | EFNB2 |
| 9369 | 557 | N | D | -4.627 | 0.399 | 9.50 | 0.385 | 0.479 | 11.12 | 1.0 | EFNB2 |
| 8773 | 526 | L | H | -4.546 | 0.782 | 5.25 | 0.110 | 0.392 | 6.75 | 1.0 | EFNB2 |
| 9853 | 581 | Y | W | -4.519 | 1.013 | 10.75 | 0.091 | 0.410 | 11.50 | 1.0 | EFNB2 |
| 8090 | 491 | S | H | -4.505 | 0.935 | 8.50 | 0.112 | 0.250 | 10.25 | 1.0 | EFNB2 |
These are the highest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 4080 | 282 | C | S | 1.609 | 1.056 | 4.50 | 0.338 | 0.325 | 3.714 | 0.50 | EFNB2 |
| 7303 | 450 | Q | I | 1.457 | 1.194 | 5.00 | 0.142 | 0.399 | 6.000 | 1.00 | EFNB2 |
| 5159 | 339 | S | A | 1.389 | 2.087 | 2.00 | 0.449 | 0.149 | 2.625 | 0.75 | EFNB2 |
| 5579 | 361 | T | A | 1.371 | 2.108 | 3.50 | 0.363 | 0.450 | 4.875 | 1.00 | EFNB2 |
| 1784 | 164 | N | S | 1.297 | 1.136 | 3.75 | 0.240 | 0.291 | 6.125 | 1.00 | EFNB2 |
| site | binding_mean | |
|---|---|---|
| 250 | 331 | 15.733 |
| 473 | 564 | 15.087 |
| 97 | 172 | 13.884 |
| 225 | 306 | 13.853 |
| 493 | 586 | 13.370 |
| ... | ... | ... |
| 412 | 501 | -58.053 |
| 160 | 236 | -58.154 |
| 305 | 389 | -59.775 |
| 492 | 585 | -62.244 |
| 407 | 494 | -66.568 |
510 rows × 2 columns
We are analyzing EFNB3 The total number of mutants was: 6619 These are the lowest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9221 | 555 | D | K | -2.147 | 1.225 | 6.0 | 0.085 | 0.422 | 6.714 | 1.0 | EFNB3 |
| 9227 | 555 | D | R | -1.510 | 0.420 | 4.0 | 0.076 | 0.526 | 4.143 | 1.0 | EFNB3 |
| 8739 | 530 | Q | L | -1.447 | 0.477 | 3.5 | -1.019 | 0.862 | 5.857 | 1.0 | EFNB3 |
| 9761 | 583 | T | R | -1.410 | 0.446 | 5.5 | -1.005 | 0.485 | 6.286 | 1.0 | EFNB3 |
| 9755 | 583 | T | K | -1.247 | 0.113 | 5.0 | -0.728 | 0.644 | 7.000 | 1.0 | EFNB3 |
These are the highest binding mutants detected:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9866 | 589 | R | G | 2.006 | 0.659 | 7.0 | -0.966 | 0.576 | 8.286 | 1.0 | EFNB3 |
| 9699 | 580 | I | M | 1.337 | 0.534 | 5.5 | -0.816 | 0.531 | 6.714 | 1.0 | EFNB3 |
| 3777 | 270 | V | Q | 1.329 | 1.028 | 5.0 | -0.885 | 0.651 | 5.429 | 1.0 | EFNB3 |
| 3719 | 267 | M | Q | 1.291 | 1.371 | 4.5 | -0.798 | 0.818 | 5.667 | 1.0 | EFNB3 |
| 8010 | 492 | Q | L | 1.261 | 0.067 | 5.5 | 0.483 | 0.183 | 5.143 | 1.0 | EFNB3 |
These are the lowest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9221 | 555 | D | K | -2.147 | 1.225 | 6.0 | 0.085 | 0.422 | 6.714 | 1.0 | EFNB3 |
| 9227 | 555 | D | R | -1.510 | 0.420 | 4.0 | 0.076 | 0.526 | 4.143 | 1.0 | EFNB3 |
| 8747 | 530 | Q | W | -0.997 | 0.547 | 3.0 | 0.018 | 0.665 | 3.286 | 1.0 | EFNB3 |
| 9214 | 555 | D | A | -0.830 | 0.200 | 3.5 | 0.376 | 0.234 | 3.714 | 1.0 | EFNB3 |
| 7038 | 441 | P | V | -0.746 | 0.182 | 5.5 | 0.371 | 0.086 | 5.286 | 1.0 | EFNB3 |
These are the highest binding mutants detected with positive entry scores:
| site | wildtype | mutant | binding_mean | binding_std | times_seen_binding | effect | effect_std | times_seen_cell_entry | frac_models | selection | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8010 | 492 | Q | L | 1.261 | 0.067 | 5.5 | 0.483 | 0.183 | 5.143 | 1.0 | EFNB3 |
| 2637 | 211 | G | F | 1.169 | 0.289 | 5.0 | 0.164 | 0.572 | 5.714 | 1.0 | EFNB3 |
| 10031 | 598 | P | G | 1.154 | 1.559 | 4.5 | 0.239 | 0.515 | 5.429 | 1.0 | EFNB3 |
| 9192 | 553 | S | W | 0.994 | 0.435 | 9.5 | 0.167 | 0.406 | 9.857 | 1.0 | EFNB3 |
| 9745 | 582 | D | W | 0.976 | 0.338 | 6.5 | 0.063 | 0.310 | 7.143 | 1.0 | EFNB3 |
| site | binding_mean | |
|---|---|---|
| 490 | 589 | 11.707 |
| 87 | 161 | 7.333 |
| 469 | 564 | 6.722 |
| 225 | 306 | 6.494 |
| 59 | 132 | 6.170 |
| ... | ... | ... |
| 270 | 351 | -7.156 |
| 165 | 242 | -7.294 |
| 465 | 560 | -7.522 |
| 162 | 238 | -8.166 |
| 461 | 555 | -11.032 |
504 rows × 2 columns
Find the top overlapping binders for both EFNB2 and EFNB3¶
In [10]:
def find_good_binding_for_both(df):
e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
e2_good = e2_good.head(30)
e3_good = e3_good.head(30)
combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
display(combo)
find_good_binding_for_both(df_binding_effect_merge)
| site | wildtype | mutant | binding_mean_E2_x | binding_std_E2_x | times_seen_binding_E2_x | effect_E2_x | effect_std_E2_x | times_seen_cell_entry_E2_x | frac_models_E2_x | ... | effect_std_E2_y | times_seen_cell_entry_E2_y | frac_models_E2_y | binding_mean_E3_y | binding_std_E3_y | times_seen_binding_E3_y | effect_E3_y | effect_std_E3_y | times_seen_cell_entry_E3_y | frac_models_E3_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 566 | F | C | 2.160 | 2.119 | 5.00 | -0.533 | 0.455 | 5.500 | 0.75 | ... | 0.455 | 5.500 | 0.75 | 1.132 | 1.194 | 3.5 | -0.851 | 0.760 | 3.833 | 1.0 |
| 1 | 211 | G | F | 1.957 | 0.549 | 4.75 | -0.665 | 0.426 | 5.125 | 1.00 | ... | 0.426 | 5.125 | 1.00 | 1.169 | 0.289 | 5.0 | 0.164 | 0.572 | 5.714 | 1.0 |
| 2 | 553 | S | W | 1.945 | 0.287 | 10.00 | -0.297 | 0.429 | 13.250 | 1.00 | ... | 0.429 | 13.250 | 1.00 | 0.994 | 0.435 | 9.5 | 0.167 | 0.406 | 9.857 | 1.0 |
| 3 | 139 | N | W | 1.930 | 0.483 | 5.75 | -0.667 | 0.622 | 8.000 | 1.00 | ... | 0.622 | 8.000 | 1.00 | 1.003 | 0.005 | 4.5 | -0.444 | 0.445 | 6.286 | 1.0 |
4 rows × 31 columns
In [11]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
print(
"These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
)
display(df.sort_values(by="binding_diff", ascending=False).head(10))
# calculate aggregate differences
agg_df = (
df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
.median()
.reset_index()
)
print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))
find_highest_lowest(df_binding_effect_merge)
# find_highest_lowest(df_E3_filter,'EFNB3')
These are the mutants with the biggest difference between EFNB2 and EFNB3:
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5836 | 530 | Q | F | -4.015 | 0.466 | 6.25 | 0.467 | 0.202 | 5.375 | 1.0 | 0.084 | 0.062 | 6.0 | 0.359 | 0.274 | 6.286 | 1.0 | 4.099 |
| 5363 | 490 | Q | Y | -4.290 | 0.900 | 5.50 | -1.088 | 0.400 | 6.375 | 1.0 | -0.461 | 0.574 | 5.5 | -0.314 | 0.797 | 5.714 | 1.0 | 3.829 |
| 5368 | 491 | S | H | -4.505 | 0.935 | 8.50 | 0.112 | 0.250 | 10.250 | 1.0 | -0.872 | 0.790 | 7.0 | -1.381 | 0.486 | 8.286 | 1.0 | 3.633 |
| 5362 | 490 | Q | W | -4.205 | 1.005 | 3.75 | 0.235 | 0.378 | 4.750 | 1.0 | -0.586 | 0.323 | 3.0 | -0.151 | 0.458 | 3.286 | 1.0 | 3.619 |
| 5367 | 491 | S | G | -4.181 | 0.595 | 6.50 | 0.130 | 0.298 | 8.625 | 1.0 | -0.562 | 0.047 | 5.5 | -0.236 | 0.608 | 6.286 | 1.0 | 3.619 |
| 6628 | 588 | I | P | -2.349 | 0.840 | 5.25 | -0.256 | 0.300 | 4.750 | 1.0 | 1.183 | 0.255 | 5.5 | -0.613 | 0.382 | 5.429 | 1.0 | 3.532 |
| 5386 | 492 | Q | K | -3.523 | 0.549 | 11.00 | 0.353 | 0.232 | 11.620 | 1.0 | 0.006 | 0.077 | 10.0 | 0.329 | 0.240 | 11.570 | 1.0 | 3.529 |
| 5390 | 492 | Q | R | -2.823 | 0.611 | 19.25 | -0.077 | 0.272 | 23.120 | 1.0 | 0.644 | 0.054 | 20.0 | 0.509 | 0.119 | 20.570 | 1.0 | 3.467 |
| 1865 | 239 | S | H | -3.440 | 0.088 | 5.25 | 0.334 | 0.272 | 4.500 | 1.0 | -0.041 | 0.832 | 4.0 | -1.219 | 0.693 | 5.000 | 1.0 | 3.399 |
| 5845 | 530 | Q | V | -4.327 | 0.838 | 6.00 | 0.106 | 0.269 | 8.625 | 1.0 | -0.987 | 0.896 | 5.5 | -1.443 | 0.739 | 7.000 | 1.0 | 3.340 |
These are the sites with the biggest difference between EFNB2 and EFNB3:
| site | binding_mean_E2 | binding_mean_E3 | binding_diff | |
|---|---|---|---|---|
| 440 | 530 | -3.8780 | -0.6065 | 3.0575 |
| 496 | 588 | -3.6705 | -0.2970 | 3.0010 |
| 406 | 491 | -3.7770 | -0.4940 | 2.9905 |
| 416 | 505 | -3.8970 | -0.7710 | 2.8970 |
| 408 | 494 | -4.0580 | -0.6220 | 2.5440 |
Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶
In [12]:
def plot_corr_binding_entry_updated(df, flag):
# Define interactive selectors for variant selection.
# 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
)
# 'variant_selector_agg' is for aggregated selection based on 'site' only.
variant_selector_agg = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
# Initialize an empty list to store charts for each unique selection in 'df'.
empty_chart = []
# Loop through each unique cell selection in the DataFrame.
for cell in list(df["selection"].unique()):
# Filter DataFrame for the current selection.
tmp_df = df[df["selection"] == cell]
# Check if aggregation flag is True to aggregate data.
if flag == True:
# Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
agg_df = (
tmp_df.groupby("site")[["binding_mean", "effect"]].sum().reset_index()
)
# Create a chart using aggregated data with specified encodings.
chart = (
alt.Chart(agg_df)
.mark_point(stroke="black", filled=True) # Use filled points with black stroke.
.encode(
x=alt.X(
"effect",
title=f"Summed {cell} Cell Entry",
axis=alt.Axis(grid=True),
),
y=alt.Y(
"binding_mean",
title=f"Summed {cell} Binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
opacity=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0.2)
),
size=alt.condition(
variant_selector_agg, alt.value(100), alt.value(50)
),
strokeWidth=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector_agg, alt.value("orange"), alt.value("black")
),
tooltip=["site", "binding_mean", "effect"],
)
.add_params(variant_selector_agg)
)
# Add the chart to the list.
empty_chart.append(chart)
else:
# Create a chart for individual data points with specified encodings.
chart = (
alt.Chart(tmp_df)
.mark_point(stroke="black", filled=True)
.encode(
x=alt.X(
"effect", title=f"{cell} Cell Entry", axis=alt.Axis(grid=True)
),
y=alt.Y(
"binding_mean",
title=f"{cell} Binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
opacity=alt.condition(
variant_selector, alt.value(1), alt.value(0.1)
),
size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
strokeWidth=alt.condition(
variant_selector, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean",
"times_seen_binding",
"effect",
],
)
.add_params(variant_selector)
)
# Add the chart to the list.
empty_chart.append(chart)
# Combine all charts horizontally with a title.
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between binding and entry")
)
# Return the combined chart for display or further use.
return combined_chart
# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()
# Save the plot
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)
# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()
# Save the aggregated plot
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
In [13]:
def plot_corr_binding_entry_updated(df, flag):
# Define interactive selectors for variant selection.
# 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
)
# 'variant_selector_agg' is for aggregated selection based on 'site' only.
variant_selector_agg = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
# Initialize an empty list to store charts for each unique selection in 'df'.
empty_chart = []
# Loop through each unique cell selection in the DataFrame.
for cell in list(df["selection"].unique()):
# Filter DataFrame for the current selection.
tmp_df = df[df["selection"] == cell]
# Check if aggregation flag is True to aggregate data.
if flag == True:
# Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
agg_df = (
tmp_df.groupby("site")[["binding_mean", "effect"]].sum().reset_index()
)
# Create a chart using aggregated data with specified encodings.
chart = (
alt.Chart(agg_df)
.mark_point(stroke="black", filled=True) # Use filled points with black stroke.
.encode(
x=alt.X(
"effect",
title=f"Summed {cell} Cell Entry",
axis=alt.Axis(grid=True),
),
y=alt.Y(
"binding_mean",
title=f"Summed {cell} Binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
opacity=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0.2)
),
size=alt.condition(
variant_selector_agg, alt.value(100), alt.value(50)
),
strokeWidth=alt.condition(
variant_selector_agg, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector_agg, alt.value("orange"), alt.value("black")
),
tooltip=["site", "binding_mean", "effect"],
)
.add_params(variant_selector_agg)
)
# Add the chart to the list.
empty_chart.append(chart)
else:
# Create a chart for individual data points with specified encodings.
chart = (
alt.Chart(tmp_df)
.mark_point(stroke="black", filled=True)
.encode(
x=alt.X(
"effect", title=f"{cell} Cell Entry", axis=alt.Axis(grid=True)
),
y=alt.Y(
"binding_mean",
title=f"{cell} Binding",
axis=alt.Axis(grid=True),
),
# Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
opacity=alt.condition(
variant_selector, alt.value(1), alt.value(0.1)
),
size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
strokeWidth=alt.condition(
variant_selector, alt.value(1), alt.value(0)
),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean",
"times_seen_binding",
"effect",
],
)
.add_params(variant_selector)
)
# Add the chart to the list.
empty_chart.append(chart)
# Combine all charts horizontally with a title.
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between binding and entry")
)
# Return the combined chart for display or further use.
return combined_chart
# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()
Same plot as above, but slightly different format¶
In [14]:
def plot_entry_binding_corr_heatmap(df):
empty_chart = []
for cell in list(df["selection"].unique()):
tmp_df = df[df["selection"] == cell]
chart = (
alt.Chart(tmp_df, title=f"{cell}")
.mark_rect()
.encode(
x=alt.X(
"effect", title="Cell Entry", axis=alt.Axis(values=[-2, -1, 0, 1])
).bin(maxbins=60),
y=alt.Y(
"binding_mean",
title="Binding",
axis=alt.Axis(values=[-4, -2, 0, 2]),
).bin(maxbins=60),
color=alt.Color("count()", title="Count").scale(scheme="greenblue"),
)
)
empty_chart.append(chart)
combined_chart = alt.hconcat(
*empty_chart, title=alt.Title("Correlation between binding and entry")
).resolve_scale(y="shared", x="shared", color="shared")
return combined_chart
entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heat.save(entry_binding_corr_heatmap)
Calculate some stats on binding¶
In [15]:
def overall_stats(df, effect, name):
# Now group sites and find sites where all mutants are deleterious
filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
# Which sites are these?
unique = filtered_df["site"].unique()
# Convert unique to a Pandas Series
unique_series = pd.Series(unique)
# Find the common elements that are also contact sites
unique_contact_bool = unique_series.isin(config["contact_sites"])
# Filter and get the common elements
common_elements = unique_series[unique_contact_bool]
print(f"The dataset we are analyzing is: {name}\n")
# Print the common elements
print(
f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
)
print(f"There are {len(unique)} sites with all negative binding score mutants\n")
print(
f"These are the sites with all negative binding score mutants: {list(unique)}\n"
)
# Now find sites with low and high binding (median)
median_df = (
df.groupby("site")["binding_mean"]
.max()
.reset_index()
.sort_values(by="binding_mean", ascending=False)
)
print("These are the sites with the highest binding mutants:\n")
display(median_df.head(5))
# Now calculate mutant number
total_mutants = df.shape[0]
mutants_above_cutoff_tolerated = df[df["effect"] > 0]
mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
["site", "effect", "binding_mean", "wildtype", "mutant"]
]
total_sites = df["site"].unique().shape[0]
print(f"The total number of sites are: {total_sites}")
overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2 Here are the contact sites that only have negative binding scores: [242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588] There are 41 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590] These are the sites with the highest binding mutants:
| site | binding_mean | |
|---|---|---|
| 474 | 566 | 2.160 |
| 487 | 580 | 2.152 |
| 250 | 331 | 2.144 |
| 188 | 269 | 2.075 |
| 493 | 586 | 1.962 |
The total number of sites are: 510 The dataset we are analyzing is: EFNB3 Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532] There are 15 sites with all negative binding score mutants These are the sites with all negative binding score mutants: [108, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584] These are the sites with the highest binding mutants:
| site | binding_mean | |
|---|---|---|
| 490 | 589 | 2.006 |
| 482 | 580 | 1.337 |
| 189 | 270 | 1.329 |
| 186 | 267 | 1.291 |
| 405 | 492 | 1.261 |
The total number of sites are: 504
Find sites with opposite effects on binding¶
In [16]:
# find sites that are different
def find_biggest_differences(df):
efnb2_good_efnb3_bad = df[
(df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
].sort_values(by="binding_mean_E2", ascending=False)
display(efnb2_good_efnb3_bad)
efnb2_bad_efnb3_good = df[
(df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
].sort_values(by="binding_mean_E3", ascending=False)
display(efnb2_bad_efnb3_good)
find_biggest_differences(df_binding_effect_merge)
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6042 | 546 | L | H | 1.523 | 1.145 | 5.25 | -1.148 | 0.387 | 6.750 | 1.0 | -0.543 | 0.665 | 3.5 | -0.851 | 0.544 | 5.143 | 1.0 | 2.066 |
| 2621 | 303 | P | C | 1.345 | 1.577 | 5.00 | -0.718 | 0.590 | 5.000 | 1.0 | -0.566 | 0.098 | 4.0 | -0.697 | 0.594 | 4.857 | 1.0 | 1.911 |
| 122 | 79 | N | S | 1.292 | 1.652 | 3.00 | -0.635 | 0.496 | 2.375 | 1.0 | -0.646 | 0.260 | 3.0 | -0.129 | 0.569 | 3.429 | 1.0 | 1.938 |
| 103 | 78 | D | I | 0.659 | 0.634 | 5.25 | -0.931 | 0.398 | 6.500 | 1.0 | -0.566 | 0.755 | 5.0 | -0.517 | 0.845 | 5.143 | 1.0 | 1.225 |
| 1682 | 224 | M | Q | 0.647 | 0.880 | 3.50 | 0.166 | 0.315 | 2.750 | 1.0 | -0.581 | 0.381 | 3.5 | 0.206 | 0.292 | 4.286 | 1.0 | 1.228 |
| 5708 | 520 | I | G | 0.591 | 1.013 | 5.00 | -1.465 | 0.349 | 8.000 | 1.0 | -0.530 | 0.148 | 5.5 | -1.309 | 0.319 | 7.000 | 1.0 | 1.121 |
| 1032 | 178 | V | T | 0.529 | 0.726 | 3.75 | -0.528 | 0.759 | 4.250 | 1.0 | -0.513 | 0.600 | 3.0 | -0.019 | 0.425 | 3.571 | 1.0 | 1.042 |
| site | wildtype | mutant | binding_mean_E2 | binding_std_E2 | times_seen_binding_E2 | effect_E2 | effect_std_E2 | times_seen_cell_entry_E2 | frac_models_E2 | binding_mean_E3 | binding_std_E3 | times_seen_binding_E3 | effect_E3 | effect_std_E3 | times_seen_cell_entry_E3 | frac_models_E3 | binding_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6628 | 588 | I | P | -2.349 | 0.840 | 5.25 | -0.256 | 0.300 | 4.750 | 1.0 | 1.183 | 0.255 | 5.5 | -0.613 | 0.382 | 5.429 | 1.0 | 3.532 |
| 6734 | 598 | P | G | -0.552 | 0.785 | 5.00 | -0.867 | 0.939 | 4.625 | 1.0 | 1.154 | 1.559 | 4.5 | 0.239 | 0.515 | 5.429 | 1.0 | 1.706 |
| 5237 | 479 | W | S | -0.802 | 1.272 | 3.25 | -1.304 | 0.878 | 5.250 | 1.0 | 0.731 | 2.957 | 2.0 | -1.469 | 0.612 | 3.286 | 1.0 | 1.533 |
| 3180 | 338 | R | G | -0.517 | 0.507 | 4.50 | 0.300 | 0.422 | 5.250 | 1.0 | 0.684 | 0.976 | 4.0 | 0.158 | 0.340 | 3.429 | 1.0 | 1.201 |
| 5364 | 491 | S | A | -0.932 | 0.537 | 5.50 | -0.669 | 0.806 | 6.250 | 1.0 | 0.645 | 0.010 | 6.5 | 0.143 | 0.358 | 6.143 | 1.0 | 1.577 |
| 5390 | 492 | Q | R | -2.823 | 0.611 | 19.25 | -0.077 | 0.272 | 23.120 | 1.0 | 0.644 | 0.054 | 20.0 | 0.509 | 0.119 | 20.570 | 1.0 | 3.467 |
Find correlations between EFNB2 and EFNB3 binding¶
In [17]:
def plot_entry_binding_corr(df):
chart = (
alt.Chart(df, title="Correlation Between Mutant Binding Scores")
.mark_rect()
.encode(
x=alt.X(
"binding_mean_E2",
title="EFNB2 binding",
axis=alt.Axis(values=[-5, 0, 2]),
).bin(maxbins=40),
y=alt.Y(
"binding_mean_E3",
title="EFNB3 binding",
axis=alt.Axis(values=[-2, 0, 2]),
).bin(maxbins=40),
color=alt.Color("count()", title="Count").scale(scheme="greenblue"),
)
)
return chart
entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
if entry_binding_combined_corr_plot is not None:
entry_binding_corr_heatmap_1.save(binding_corr_heatmap)
In [18]:
def plot_affinity_solid(df):
df = df.dropna()
# calculate r value
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["binding_mean_E2"], df["binding_mean_E3"]
)
r_value = float(r_value)
# make chart
chart = (
alt.Chart(
df,
title=alt.Title(
"Correlation between Mutant Binding Scores", subtitle=f"r={r_value:.2f}"
),
)
.mark_point(color="black", size=30, opacity=0.2, filled=True)
.encode(
x=alt.X("binding_mean_E2", title=("EFNB2 Binding")),
y=alt.Y("binding_mean_E3", title=("EFNB3 Binding")),
tooltip=[
"site",
"wildtype",
"mutant",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
)
min = int(df["binding_mean_E2"].min())
max = int(df["binding_mean_E3"].max())
text = (
alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=-10, dy=-20)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
chart_and_text = chart
return chart_and_text
E2_E3_corr = plot_affinity_solid(df_binding_effect_merge)
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_corr.save(E2_E3_correlation)
Plot correlations between summary statistics for each site¶
In [19]:
def plot_affinity_solid_mean(df):
df = df.dropna()
means = (
df.groupby("site")
.agg(
{
"effect_E2": "median",
"effect_E3": "median",
"binding_mean_E2": "median",
"binding_mean_E3": "median",
"wildtype": "first",
}
)
.reset_index()
)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
means["binding_mean_E2"], means["binding_mean_E3"]
)
r_value = float(r_value)
chart = (
alt.Chart(
means,
title=alt.Title(
"Correlation between Aggregate Mutant Binding Scores",
subtitle=f"r={r_value:.2f}",
),
)
.mark_point(size=50, opacity=0.3)
.encode(
x=alt.X(
"binding_mean_E2",
title=("Median EFNB2 Binding"),
axis=alt.Axis(tickCount=3),
),
y=alt.Y(
"binding_mean_E3",
title=("Median EFNB3 Binding"),
axis=alt.Axis(tickCount=3),
),
tooltip=[
"site",
"wildtype",
"binding_mean_E2",
"binding_mean_E3",
"effect_E2",
"effect_E3",
],
)
)
text = (
alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
.mark_text(align="left", baseline="top", dx=0, dy=-10)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
chart_and_text = chart
return chart_and_text
E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
E2_E3_site_corr.save(E2_E3_correlation_site)
if entry_binding_combined_corr_plot is not None:
(E2_E3_corr | E2_E3_site_corr).save(combined_E2_E3_site_corr)
Make plot showing binding by site (median)¶
In [20]:
def plot_affinity_by_site_median(df):
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
empty_charts = []
for selection in ["binding_mean_E2", "binding_mean_E3"]:
if selection == "binding_mean_E2":
name = "EFNB2 Binding"
else:
name = "EFNB3 Binding"
mean = df.groupby("site")[selection].max().reset_index()
mean = mean[mean[selection] >= 0]
chart = (
alt.Chart(mean)
.mark_point(stroke="black", filled=True, size=50)
.encode(
x=alt.X(
"site",
title=("Site"),
axis=alt.Axis(grid=True, tickCount=4),
scale=alt.Scale(domain=[70, 602]),
),
y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=3)),
tooltip=["site"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.5)),
strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
)
.properties(height=150, width=500)
.add_params(variant_selector)
)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1, title="Max Binding by Site")
return combined_chart
binding_by_site = plot_affinity_by_site_median(df_binding_effect_merge)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
binding_by_site.save(binding_by_site_plot)
In [21]:
def plot_affinity_by_contact_site(df, sites_to_show, title_text):
variant_selector = alt.selection_point(
on="mouseover", nearest=True, empty=False, fields=["site"], value=0
)
empty_charts = []
contact_df = df[df["site"].isin(sites_to_show)]
sites = list(contact_df["site"].unique())
for selection in df["selection"].unique():
tmp_df = contact_df[contact_df["selection"] == selection]
mean = tmp_df.groupby("site")["binding_mean"].max().reset_index()
chart = (
alt.Chart(mean)
.mark_point(size=100)
.encode(
x=alt.X(
"site:O",
sort=sites,
title=("Site"),
axis=alt.Axis(grid=True, labelAngle=-90),
scale=alt.Scale(domain=sites),
),
y=alt.Y(
"binding_mean", title=(f"{selection}"), axis=alt.Axis(grid=True)
),
tooltip=["site"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
)
.add_params(variant_selector)
)
empty_charts.append(chart)
combined_chart = alt.vconcat(*empty_charts, spacing=1, title=title_text)
return combined_chart
contact_binding_by_site = plot_affinity_by_contact_site(
df_binding_effect_concat, config["contact_sites"], "Max Binding in Contact"
)
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site.save(max_binding_in_contact)
contact_binding_by_site_stalk = plot_affinity_by_contact_site(
df_binding_effect_concat, list(range(96, 147)), "Max Binding in Stalk"
)
contact_binding_by_site_stalk.display()
if entry_binding_combined_corr_plot is not None:
contact_binding_by_site_stalk.save(max_binding_in_stalk)
Make bubble plots for binding in different areas of receptor pocket¶
In [22]:
def make_boxplot_binding_region(
df, title
): # Create a box plot using Altair for aggregated means
barrel_ranges = {
"Hydrophobic": config["hydrophobic"],
"Salt Bridges": config["salt_bridges"],
"Hydrogen Bonds": config["h_bond_total"],
"Contact": config["contact_sites"],
"Overall": list(range(71, 602)),
}
mean_df = df.groupby("site")[["binding_mean"]].median().reset_index()
custom_order = [
"Hydrophobic",
"Salt Bridges",
"Hydrogen Bonds",
"Contact",
"Overall",
]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"barrel": barrel, "effect": row["binding_mean"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df)
.mark_point(size=50, opacity=0.4)
.encode(
x=alt.X(
"barrel:O", sort=custom_order, title=None, axis=alt.Axis(labelAngle=-90)
),
y=alt.Y(
"effect",
title=f"Median {title} Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["barrel", "effect", "site"],
)
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
)
return chart.display()
make_boxplot_binding_region(df_E2_filter, "EFNB2")
make_boxplot_binding_region(df_E3_filter, "EFNB3")
make boxplot of binding scores by region¶
In [23]:
def make_boxplot_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
"Receptor Contact": config["contact_sites"],
"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in df["selection"].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{
"region": barrel,
"binding_mean": row["binding_mean"],
"site": row["site"],
}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color="darkgray", extent="min-max", opacity=1)
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_mean",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "binding_mean", "site"],
)
.properties(width=config["width"], height=config["height"])
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_binding_region(df_binding_effect_concat)
entry_region_boxplot.display()
if entry_binding_combined_corr_plot is not None:
entry_region_boxplot.save(binding_region_boxplot_plot)
In [24]:
def make_bubble_binding_region(df):
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
"Receptor Contact": config["contact_sites"],
"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in df["selection"].unique():
tmp_df = df[df["selection"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{
"region": barrel,
"binding_mean": row["binding_mean"],
"site": row["site"],
"mutant": row["mutant"],
}
)
agg_means_df = pd.DataFrame(agg_means)
variant_selector = alt.selection_point(
on="mouseover", empty=False, fields=["site", "mutant"], value=1
)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_point(opacity=0.3, stroke="black")
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"binding_mean",
title=f"Binding",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["region", "binding_mean", "site", "mutant"],
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
size=alt.condition(variant_selector, alt.value(50), alt.value(15)),
)
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
.properties(width=config["width"], height=config["height"])
).add_params(variant_selector)
empty_charts.append(chart)
combined_effect_chart = (
alt.hconcat(*empty_charts)
.resolve_scale(y="shared", x="shared", color="independent")
.add_params(variant_selector)
)
return combined_effect_chart
entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
entry_region_bubble.save(binding_region_bubble_plot)
In [ ]: